import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.patheffects as path_effects
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff
from textblob import TextBlob
import altair as alt
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the catalogue. `netflix_tiles` keeps the values exactly as read from
# the CSV; `netflix` is the working frame in which every missing cell is 0.
netflix_tiles = pd.read_csv(
    "C:\\Users\\Prashant\\Desktop\\Case Studies\\Python\\Netflix\\netflix.csv"
)
netflix = netflix_tiles.fillna(value=0)
netflix.describe()
# Number of titles per content type ('type' column).
counts = netflix["type"].value_counts()

# Explicit two-column frame so the charts do not depend on how the installed
# pandas version names the value_counts() result.
type_counts = counts.rename_axis('type').reset_index(name='count')

fig = px.bar(
    type_counts,
    x='type',
    y='count',
    title='Count of Shows and Movies on Netflix',
    color_discrete_sequence=px.colors.cyclical.HSV,
    template='plotly_dark',
)
fig.show()

# The slices are content types, not ratings, so the title says so.
fig = px.pie(
    type_counts,
    names='type',
    values='count',
    title='Distribution of Content Types on Netflix',
    color_discrete_sequence=px.colors.sequential.Reds_r,
    template='plotly_dark',
)
fig.show()
# Split the zero-filled catalogue by content type and keep the oldest 10 /
# newest 50 titles of each type, ranked by release year.
is_tv_show = netflix['type'] == 'TV Show'
netflix_shows = netflix[is_tv_show]
netflix_shows
oldest_series = netflix_shows.sort_values(by='release_year').head(10)
oldest_series
latest_series = netflix_shows.sort_values(by='release_year', ascending=False).head(50)
latest_series

is_movie = netflix['type'] == 'Movie'
netflix_movies = netflix[is_movie]
netflix_movies
oldest_movies = netflix_movies.sort_values(by='release_year').head(10)
oldest_movies
latest_movies = netflix_movies.sort_values(by='release_year', ascending=False).head(50)
latest_movies
# Heatmap of how many titles were added in each (month, year) pair.
netflix_date = netflix_tiles[['date_added']].dropna()
# The year is whatever follows the last ", "; the month is the first word
# once leading whitespace is removed.
netflix_date['year'] = netflix_date['date_added'].str.split(', ').str[-1]
netflix_date['month'] = netflix_date['date_added'].str.lstrip().str.split(' ').str[0]

calendar_months = ['January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December']
# Rows are listed December-first.
month_order = list(reversed(calendar_months))

per_year_month = netflix_date.groupby('year')['month'].value_counts().unstack().fillna(0)
df = per_year_month[month_order].T

fig = px.imshow(
    df,
    color_continuous_scale=px.colors.sequential.Reds,
    aspect='auto',
    template='plotly_dark',
    width=1000,
    height=1000,
    title='Netflix Content Update',
)
fig.show()
# Titles per release year and content type, from 2010 onwards.
yearly_content = netflix[['type', 'release_year']].rename(
    columns={"release_year": "Release Year"}
)
yc = (
    yearly_content
    .groupby(['Release Year', 'type'])
    .size()
    .reset_index(name='Total Content')
)
yc = yc[yc['Release Year'].ge(2010)]

fig = px.line(
    yc,
    x="Release Year",
    y="Total Content",
    color='type',
    title='Trend of content produced over the years on Netflix',
    color_discrete_sequence=px.colors.sequential.Reds_r,
    template='plotly_dark',
    width=1000,
    height=400,
)
fig.show()
# Release-year distributions, one violin per content type; every title is
# also drawn as an individual point.
violin_options = dict(
    x='release_year',
    color_discrete_sequence=px.colors.cyclical.HSV,
    template='plotly_dark',
    points='all',
)

fig = px.violin(netflix_shows, title='Year-wise Analysis of TV Shows', **violin_options)
fig.show()

fig = px.violin(netflix_movies, title='Year-wise Analysis of Movies', **violin_options)
fig.show()
# Titles per release year (all content and per type), truncated at 2019.
year_data = netflix['release_year'].value_counts().sort_index().loc[:2019]
type_data = (
    netflix.groupby('type')['release_year']
    .value_counts()
    .sort_index()
    .unstack()
    .fillna(0)
    .T
    .loc[:2019]
)


def _shadowed():
    """Return a fresh shadow-plus-normal path-effect pair for one line."""
    return [path_effects.SimpleLineShadow(), path_effects.Normal()]


def _count_at(moment):
    """Linearly interpolate ``year_data`` at a (possibly fractional) year."""
    year = int(moment)
    fraction = moment - year
    return year_data[year] * (1 - fraction) + year_data[year + 1] * fraction


fig, ax = plt.subplots(1, 1, figsize=(28, 15))
ax.plot(year_data.index, year_data, color="maroon", linewidth=5,
        label='Total', path_effects=_shadowed())
ax.plot(type_data.index, type_data['Movie'], color='red', linewidth=5,
        label='Movie', path_effects=_shadowed())
ax.plot(type_data.index, type_data['TV Show'], color='salmon', linewidth=5,
        label='TV Show', path_effects=_shadowed())
ax.set_xlim(2006, 2020)
ax.set_ylim(-40, 2700)

# x positions (fractional years) of the annotated events, their captions,
# and the per-event offsets that keep the caption boxes from overlapping.
t = [
    2008,
    2010.8,
    2012.1,
    2013.1,
    2015.7,
    2016.1,
    2016.9,
]
events = [
    "Launch Streaming Video\n2007.1",
    "Expanding Streaming Service\nStarting with Canada | 2010.11",
    "Expanding to Europe\n2012.1",
    "First Original Content\n2013.2",
    "Expanding to Japan\n2015.9",
    "Original targeting Kids\n2016/1",
    "Offline Playback Features to all of Users\n2016/11",
]
up_down = [100, 110, 280, 110, 0, 0, 0]
left_right = [-1, 0, 0, 0, 1, 1, 1.6]

for t_i, event_i, ud_i, lr_i in zip(t, events, up_down, left_right):
    # Height of the 'Total' line at this moment; computed once and shared by
    # the caption and its marker.
    y_i = _count_at(t_i)
    ax.annotate(
        event_i,
        xy=(t_i + lr_i, y_i + ud_i),
        xytext=(0, 0), textcoords='offset points',
        va="center", ha="center",
        color="w", fontsize=16,
        bbox=dict(boxstyle='round4', pad=0.5, color='#303030', alpha=0.90),
    )
    ax.scatter(t_i, y_i, color='#E50914', s=300)

ax.set_facecolor((0.4, 0.4, 0.4))
ax.set_title("Why Netflix's Contents Count Soared?", position=(0.23, 1.0 + 0.03),
             fontsize=30, fontweight='bold')
ax.yaxis.set_tick_params(labelsize=20)
ax.xaxis.set_tick_params(labelsize=20)
plt.legend(loc='upper left', fontsize=20)
# No extra plt.figure() here: it only produced a second, empty figure.
plt.show()
# Month in which each TV show was added. The dates are read from the raw
# frame because the working frame replaced missing dates with 0, which
# parses as 1970-01-01 and was being counted as a January addition.
show_dates = netflix_tiles.loc[netflix_shows.index, 'date_added'].dropna().str.strip()
monthly_content = (
    pd.to_datetime(show_dates)
    .dt.month
    .value_counts()
    .reindex(range(1, 13), fill_value=0)  # keep all 12 months, in calendar order
)
order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_df = pd.DataFrame({'month': order, 'titles_added': monthly_content.to_numpy()})

fig = px.histogram(
    monthly_df,
    x='month',
    y='titles_added',
    title='Distribution of Netflix Content On Months',
    color='titles_added',
    category_orders={'month': order},  # bars stay in calendar order
    color_discrete_sequence=px.colors.sequential.Reds_r,
    template='plotly_dark',
)
fig.show()
# Weekday (0 = Monday) on which each title was added. Parsed from the raw
# dates: the working frame holds 0 for missing dates, which parses as
# 1970-01-01 and was being counted as a Thursday. Titles without a date get
# NaN here and are left out of the counts.
added_on = pd.to_datetime(netflix_tiles['date_added'].str.strip())
netflix['weekday'] = added_on.dt.weekday
daily_content = (
    netflix['weekday']
    .dropna()
    .astype(int)
    .value_counts()
    .reindex(range(7), fill_value=0)  # keep all 7 days, Monday first
)
order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
daily_df = pd.DataFrame({'weekday': order, 'titles_added': daily_content.to_numpy()})

fig = px.histogram(
    daily_df,
    x='weekday',
    y='titles_added',
    title='Distribution of Netflix Content On Days Of Week',
    color='titles_added',
    category_orders={'weekday': order},  # bars stay in week order
    color_discrete_sequence=px.colors.sequential.Reds_r,
    template='plotly_dark',
)
fig.show()
# Lower-cased country name -> three-letter code used as a choropleth location.
# Every key is lower case because lookups are done with `name.lower()`.
# The Serbia..Tuvalu range had been displaced by a duplicated
# Nicaragua..Senegal run; it is restored here and the duplicates are removed.
country_codes = {
    'afghanistan': 'AFG',
    'albania': 'ALB',
    'algeria': 'DZA',
    'american samoa': 'ASM',
    'andorra': 'AND',
    'angola': 'AGO',
    'anguilla': 'AIA',
    'antigua and barbuda': 'ATG',
    'argentina': 'ARG',
    'armenia': 'ARM',
    'aruba': 'ABW',
    'australia': 'AUS',
    'austria': 'AUT',
    'azerbaijan': 'AZE',
    'bahamas': 'BHS',
    'bahrain': 'BHR',
    'bangladesh': 'BGD',
    'barbados': 'BRB',
    'belarus': 'BLR',
    'belgium': 'BEL',
    'belize': 'BLZ',
    'benin': 'BEN',
    'bermuda': 'BMU',
    'bhutan': 'BTN',
    'bolivia': 'BOL',
    'bosnia and herzegovina': 'BIH',
    'botswana': 'BWA',
    'brazil': 'BRA',
    'british virgin islands': 'VGB',
    'brunei': 'BRN',
    'bulgaria': 'BGR',
    'burkina faso': 'BFA',
    'burma': 'MMR',
    'burundi': 'BDI',
    'cabo verde': 'CPV',
    'cambodia': 'KHM',
    'cameroon': 'CMR',
    'canada': 'CAN',
    'cayman islands': 'CYM',
    'central african republic': 'CAF',
    'chad': 'TCD',
    'chile': 'CHL',
    'china': 'CHN',
    'colombia': 'COL',
    'comoros': 'COM',
    'congo democratic': 'COD',
    'congo republic': 'COG',
    'cook islands': 'COK',
    'costa rica': 'CRI',
    "cote d'ivoire": 'CIV',
    'croatia': 'HRV',
    'cuba': 'CUB',
    'curacao': 'CUW',
    'cyprus': 'CYP',
    'czech republic': 'CZE',
    'denmark': 'DNK',
    'djibouti': 'DJI',
    'dominica': 'DMA',
    'dominican republic': 'DOM',
    'ecuador': 'ECU',
    'egypt': 'EGY',
    'el salvador': 'SLV',
    'equatorial guinea': 'GNQ',
    'eritrea': 'ERI',
    'estonia': 'EST',
    'ethiopia': 'ETH',
    'falkland islands': 'FLK',
    'faroe islands': 'FRO',
    'fiji': 'FJI',
    'finland': 'FIN',
    'france': 'FRA',
    'french polynesia': 'PYF',
    'gabon': 'GAB',
    'gambia, the': 'GMB',
    'georgia': 'GEO',
    'germany': 'DEU',
    'ghana': 'GHA',
    'gibraltar': 'GIB',
    'greece': 'GRC',
    'greenland': 'GRL',
    'grenada': 'GRD',
    'guam': 'GUM',
    'guatemala': 'GTM',
    'guernsey': 'GGY',
    'guinea-bissau': 'GNB',
    'guinea': 'GIN',
    'guyana': 'GUY',
    'haiti': 'HTI',
    'honduras': 'HND',
    'hong kong': 'HKG',
    'hungary': 'HUN',
    'iceland': 'ISL',
    'india': 'IND',
    'indonesia': 'IDN',
    'iran': 'IRN',
    'iraq': 'IRQ',
    'ireland': 'IRL',
    'isle of man': 'IMN',
    'israel': 'ISR',
    'italy': 'ITA',
    'jamaica': 'JAM',
    'japan': 'JPN',
    'jersey': 'JEY',
    'jordan': 'JOR',
    'kazakhstan': 'KAZ',
    'kenya': 'KEN',
    'kiribati': 'KIR',
    'north korea': 'PRK',
    'south korea': 'KOR',
    'kosovo': 'KSV',
    'kuwait': 'KWT',
    'kyrgyzstan': 'KGZ',
    'laos': 'LAO',
    'latvia': 'LVA',
    'lebanon': 'LBN',
    'lesotho': 'LSO',
    'liberia': 'LBR',
    'libya': 'LBY',
    'liechtenstein': 'LIE',
    'lithuania': 'LTU',
    'luxembourg': 'LUX',
    'macau': 'MAC',
    'macedonia': 'MKD',
    'madagascar': 'MDG',
    'malawi': 'MWI',
    'malaysia': 'MYS',
    'maldives': 'MDV',
    'mali': 'MLI',
    'malta': 'MLT',
    'marshall islands': 'MHL',
    'mauritania': 'MRT',
    'mauritius': 'MUS',
    'mexico': 'MEX',
    'micronesia': 'FSM',
    'moldova': 'MDA',
    'monaco': 'MCO',
    'mongolia': 'MNG',
    'montenegro': 'MNE',
    'morocco': 'MAR',
    'mozambique': 'MOZ',
    'namibia': 'NAM',
    'nepal': 'NPL',
    'netherlands': 'NLD',
    'new caledonia': 'NCL',
    'new zealand': 'NZL',
    'nicaragua': 'NIC',
    'nigeria': 'NGA',
    'niger': 'NER',
    'niue': 'NIU',
    'northern mariana islands': 'MNP',
    'norway': 'NOR',
    'oman': 'OMN',
    'pakistan': 'PAK',
    'palau': 'PLW',
    'panama': 'PAN',
    'papua new guinea': 'PNG',
    'paraguay': 'PRY',
    'peru': 'PER',
    'philippines': 'PHL',
    'poland': 'POL',
    'portugal': 'PRT',
    'puerto rico': 'PRI',
    'qatar': 'QAT',
    'romania': 'ROU',
    'russia': 'RUS',
    'rwanda': 'RWA',
    'saint kitts and nevis': 'KNA',
    'saint lucia': 'LCA',
    'saint martin': 'MAF',
    'saint pierre and miquelon': 'SPM',
    'saint vincent and the grenadines': 'VCT',
    'samoa': 'WSM',
    'san marino': 'SMR',
    'sao tome and principe': 'STP',
    'saudi arabia': 'SAU',
    'senegal': 'SEN',
    'serbia': 'SRB',
    'seychelles': 'SYC',
    'sierra leone': 'SLE',
    'singapore': 'SGP',
    'sint maarten': 'SXM',
    'slovakia': 'SVK',
    'slovenia': 'SVN',
    'solomon islands': 'SLB',
    'somalia': 'SOM',
    'south africa': 'ZAF',
    'south sudan': 'SSD',
    'spain': 'ESP',
    'sri lanka': 'LKA',
    'sudan': 'SDN',
    'suriname': 'SUR',
    'swaziland': 'SWZ',
    'sweden': 'SWE',
    'switzerland': 'CHE',
    'syria': 'SYR',
    'taiwan': 'TWN',
    'tajikistan': 'TJK',
    'tanzania': 'TZA',
    'thailand': 'THA',
    'timor-leste': 'TLS',
    'togo': 'TGO',
    'tonga': 'TON',
    'trinidad and tobago': 'TTO',
    'tunisia': 'TUN',
    'turkey': 'TUR',
    'turkmenistan': 'TKM',
    'tuvalu': 'TUV',
    'uganda': 'UGA',
    'ukraine': 'UKR',
    'united arab emirates': 'ARE',
    'united kingdom': 'GBR',
    'united states': 'USA',
    'uruguay': 'URY',
    'uzbekistan': 'UZB',
    'vanuatu': 'VUT',
    'venezuela': 'VEN',
    'vietnam': 'VNM',
    'virgin islands': 'VGB',
    'west bank': 'WBG',
    'yemen': 'YEM',
    'zambia': 'ZMB',
    'zimbabwe': 'ZWE',
}
## countries
def geoplot(netflix_tiles):
    """Draw a world choropleth of title counts per country.

    The non-null entries of the ``country`` column are joined and split on
    ``", "`` so that a title listing several countries counts once for each.

    Args:
        netflix_tiles: DataFrame with a ``country`` column of comma-separated
            country names.

    Returns:
        dict mapping every country name found in the column (as written
        there) to its number of occurrences, including names that have no
        entry in ``country_codes``.
    """
    names = ", ".join(netflix_tiles['country'].dropna()).split(", ")
    country = dict(Counter(names))

    # Only names with a known code can be placed on the map. Unknown names
    # used to be written under the empty-string location, each overwriting
    # the previous one; they are now skipped. Counts are summed when several
    # names share one code.
    country_with_code = {}
    for name, total in country.items():
        code = country_codes.get(name.strip(" ,").lower())
        if code is None:
            continue
        country_with_code[code] = country_with_code.get(code, 0) + total

    trace = go.Choropleth(
        locations=list(country_with_code),
        z=list(country_with_code.values()),
        # `colorscale` is the trace-level attribute; the former
        # `color_continuous_scale` key is a plotly.express argument and had
        # no effect on a raw trace.
        colorscale=px.colors.cyclical.HSV,
        autocolorscale=False,
        reversescale=True,
        marker=dict(line=dict(color='gray', width=0.5)),
        colorbar=dict(title=''),
    )
    layout = go.Layout(
        title='',
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection=dict(type='mercator'),
        ),
    )
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig, filename='d3-world-map')
    return country
# Horizontal bar chart of the 25 most frequent countries. The ranking is
# reversed so that the largest bar is drawn at the top.
country_vals = geoplot(netflix_tiles)
tabs = Counter(country_vals).most_common(25)
labels = [name for name, _total in reversed(tabs)]
values = [total for _name, total in reversed(tabs)]

plot = go.Bar(
    y=labels,
    x=values,
    orientation="h",
    name="",
    marker=dict(color="#d73030"),
)
data = [plot]
layout = go.Layout(
    title="Countries with most content",
    height=700,
    legend=dict(x=0.1, y=1.1, orientation="h"),
    template='plotly_dark',
)
fig = go.Figure(data, layout=layout)
fig.show()
# Animated choropleth: one frame per release year, coloured by the number of
# rows whose `country` value equals the plotted country name.
per_year_country = netflix.groupby('release_year')['country'].value_counts()
year_country2 = per_year_country.reset_index(name='counts')

fig = px.choropleth(
    year_country2,
    locations='country',
    color='counts',
    locationmode='country names',
    animation_frame='release_year',
    range_color=[0, 200],
    color_continuous_scale=px.colors.sequential.Reds,
    template='plotly_dark',
)
fig.update_layout(title='Comparison by country')
fig.show()
# Titles whose country is exactly 'India'. Each mask is built from the frame
# it filters, so pandas never has to reindex a longer boolean Series onto a
# subset (which emits a warning and is easy to get wrong).
netflix_india = netflix_tiles[netflix_tiles['country'] == 'India'].dropna()
netflix_india

netflix_india_shows = netflix_shows[netflix_shows['country'] == 'India']
oldest_indian_show = netflix_india_shows.sort_values(by='release_year')[0:10]
oldest_indian_show

netflix_india_movies = netflix_movies[netflix_movies['country'] == 'India']
oldest_indian_movie = netflix_india_movies.sort_values(by='release_year')[0:10]
oldest_indian_movie

# Mean running time of Indian movies; ' min' is removed as a literal suffix.
indian_movie_rows = (netflix.type == 'Movie') & (netflix.country == 'India')
Ind_movie_length_min = netflix.loc[indian_movie_rows].duration.str.replace(' min', '', regex=False)
Ind_Average_movie_length = Ind_movie_length_min.astype(float).mean()
print(f'Average movie length in India is {Ind_Average_movie_length:0.2f} min')
def Genre(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() flattens the per-row lists in one pass. Summing the lists
    # copied them repeatedly and yielded the integer 0 for an empty
    # selection, which was then counted as a value.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Genre counts for TV shows whose country mentions India.
mentions_india = netflix_tiles.country.str.contains('India')
is_tv_show_row = netflix_tiles.type == 'TV Show'
condition = (mentions_india) & (is_tv_show_row)
column_name = "listed_in"
Genre(netflix_tiles, condition, column_name)

# 15 most frequent cast members and directors among the fully populated rows
# whose country is exactly 'India'.
india_cast = netflix_india.cast.dropna().str.split(', ').sum()
pd.Series(india_cast).value_counts().head(15)
india_directors = netflix_india.director.dropna().str.split(', ').sum()
pd.Series(india_directors).value_counts().head(15)
# Titles per release year and type for rows whose country is exactly
# 'India', from 2010 onwards.
netflix_indiam = netflix_tiles[netflix['country'] == 'India']
india_content = netflix_indiam[['type', 'release_year']].rename(
    columns={"release_year": "Release Year"}
)
indian_content = (
    india_content
    .groupby(['Release Year', 'type'])
    .size()
    .reset_index(name='Total Content')
)
indian_content = indian_content[indian_content['Release Year'].ge(2010)]

fig3 = px.line(
    indian_content,
    x="Release Year",
    y="Total Content",
    color='type',
    title='Content produced by India over the years on Netflix',
    color_discrete_sequence=px.colors.sequential.Reds_r,
    template='plotly_dark',
)
fig3.show()
# Titles whose country is exactly 'United States'. Each mask is built from
# the frame it filters.
netflix_usa = netflix_tiles[netflix_tiles['country'] == 'United States']
netflix_usa

netflix_american_movies = netflix_movies[netflix_movies['country'] == 'United States']
# Rank the movie subset; this used to sort `netflix_usa`, which also contains
# TV shows.
oldest_american_movie = netflix_american_movies.sort_values(by='release_year')[0:10]
oldest_american_movie

netflix_american_shows = netflix_shows[netflix_shows['country'] == 'United States']
oldest_american_show = netflix_american_shows.sort_values(by='release_year')[0:10]
oldest_american_show

# 15 most frequent cast members and directors; explode() flattens the
# per-title lists without concatenating them one by one.
netflix_usa.cast.dropna().str.split(', ').explode().value_counts().head(15)
netflix_usa.director.dropna().str.split(', ').explode().value_counts().head(15)

# Titles per release year and type, from 2010 onwards.
usa_content = netflix_usa[['type', 'release_year']]
usa_content = usa_content.rename(columns={"release_year": "Release Year"})
american_content = usa_content.groupby(['Release Year', 'type']).size().reset_index(name='Total Content')
american_content = american_content[american_content['Release Year'] >= 2010]
fig3 = px.line(
    american_content,
    x="Release Year",
    y="Total Content",
    color='type',
    title='Content produced by USA over the years on Netflix',
    color_discrete_sequence=px.colors.sequential.Reds_r,
    template='plotly_dark',
)
fig3.show()
# Convert movie durations ('90 min') to numbers and plot their density.
# Work on an explicit copy so the assignment does not target a slice of
# `netflix`.
netflix_movies = netflix_movies.copy()
minutes = netflix_movies['duration'].str.replace(' min', '', regex=False)
# Cells that are not text (the 0 placeholder for a missing duration) become
# NaN instead of aborting the integer conversion.
netflix_movies['duration'] = pd.to_numeric(minutes, errors='coerce')
netflix_movies['duration']
sns.set(style="darkgrid")
sns.kdeplot(data=netflix_movies['duration'].dropna(), shade=True)
# Season count per TV show, and a chart of the 20 shows with the most
# seasons.
features = ['title', 'duration']
# Copy so the new column is not written into a slice of `netflix`.
durations = netflix_shows[features].copy()
# Take the leading number of '1 Season' / '3 Seasons' directly rather than
# deleting ' Season' and then every 's'.
season_text = durations['duration'].astype(str).str.extract(r'(\d+)', expand=False)
durations['no_of_seasons'] = pd.to_numeric(season_text, errors='coerce')
# Rows without a readable number cannot be ranked.
durations = durations.dropna(subset=['no_of_seasons'])
durations['no_of_seasons'] = durations['no_of_seasons'].astype(int)

t = ['title', 'no_of_seasons']
top = durations[t]
top = top.sort_values(by='no_of_seasons', ascending=False)
top20 = top[0:20]
fig = px.histogram(
    top20,
    x='title',
    y='no_of_seasons',
    title='Seasons of Shows',
    color='no_of_seasons',
    color_discrete_sequence=px.colors.sequential.Reds_r,
    template='plotly_dark',
)
fig.show()
# The 20 shows with the fewest seasons. `top` (title and season count,
# sorted with most seasons first) is already built by the preceding section,
# so it is reused instead of being recomputed from scratch.
bottom = top.sort_values(by='no_of_seasons')
bottom = bottom[0:20]
bottom
# Frequency of each duration value across the catalogue. The Series method
# replaces the deprecated top-level pd.value_counts().
top_duration = netflix['duration'].value_counts()
top_duration
# Rating distribution per content type. Only rows without a rating are
# dropped; a bare dropna() also removed every title with any other missing
# field (director, cast, country, ...) before the ratings were counted.
rating_shows = netflix_tiles[netflix_tiles['type'] == 'TV Show'].dropna(subset=['rating'])
Rating_shows = rating_shows.groupby(['rating']).size().reset_index(name='counts')
fig = px.pie(
    Rating_shows,
    values='counts',
    names='rating',
    title='Distribution of Show Ratings on Netflix',
    color_discrete_sequence=px.colors.sequential.Reds_r,
    template='plotly_dark',
)
fig.show()

rating_movies = netflix_tiles[netflix_tiles['type'] == 'Movie'].dropna(subset=['rating'])
Rating_movies = rating_movies.groupby(['rating']).size().reset_index(name='counts')
fig = px.pie(
    Rating_movies,
    values='counts',
    names='rating',
    title='Distribution of Movie Ratings on Netflix',
    color_discrete_sequence=px.colors.sequential.Reds_r,
    template='plotly_dark',
)
fig.show()
# Genre frequencies for TV shows: split each `listed_in` value on commas,
# remove all spaces from every piece, then count and plot.
genres1 = list(netflix_shows['listed_in'])
gen1 = [
    piece.replace(' ', "")
    for entry in genres1
    for piece in entry.split(',')
]
# most_common() orders by count, highest first, keeping first-seen order
# among ties.
g1 = dict(Counter(gen1).most_common())
x = list(g1.keys())
y = list(g1.values())
z = [x, y]
fig = px.scatter(
    data_frame=z,
    x=x,
    y=y,
    size=y,
    title='Show Genre on Netflix',
    color_discrete_sequence=px.colors.cyclical.HSV,
    template='plotly_dark',
)
fig.show()
def Sci_Fi(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() replaces list summation, which was quadratic and turned an
    # empty selection into a single bogus value 0.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Country counts for TV shows whose `listed_in` contains 'Sci-Fi & Fantasy'.
sci_fi_rows = (netflix_tiles['type'] == 'TV Show') & (
    netflix_tiles['listed_in'].str.contains('Sci-Fi & Fantasy')
)
Sci_Fi(netflix_tiles, sci_fi_rows, 'country')
# Anime analysis country-wise (number of titles per listed country)
def Anime(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() replaces list summation, which was quadratic and turned an
    # empty selection into a single bogus value 0.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Country counts for TV shows whose `listed_in` contains "Anime Series".
anime_rows = (netflix_tiles['type'] == 'TV Show') & (
    netflix_tiles['listed_in'].str.contains("Anime Series")
)
Anime(netflix_tiles, anime_rows, 'country')
# Kids_show analysis country-wise
def Kids_show(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() replaces list summation, which was quadratic and turned an
    # empty selection into a single bogus value 0.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Country counts for TV shows whose `listed_in` contains "Kids' TV".
kids_rows = (netflix_tiles['type'] == 'TV Show') & (
    netflix_tiles['listed_in'].str.contains("Kids' TV")
)
Kids_show(netflix_tiles, kids_rows, 'country')
# Romance genre analysis country-wise
def Romance(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() replaces list summation, which was quadratic and turned an
    # empty selection into a single bogus value 0.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Country counts for TV shows whose `listed_in` contains "Romantic TV Shows".
romance_rows = (netflix_tiles['type'] == 'TV Show') & (
    netflix_tiles['listed_in'].str.contains("Romantic TV Shows")
)
Romance(netflix_tiles, romance_rows, 'country')
# Comedy genre analysis country-wise
def Comedy(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() replaces list summation, which was quadratic and turned an
    # empty selection into a single bogus value 0.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Country counts for TV shows whose `listed_in` contains "TV Comedies".
comedy_rows = (netflix_tiles['type'] == 'TV Show') & (
    netflix_tiles['listed_in'].str.contains("TV Comedies")
)
Comedy(netflix_tiles, comedy_rows, 'country')
# International_shows genre analysis country-wise
def International_shows(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() replaces list summation, which was quadratic and turned an
    # empty selection into a single bogus value 0.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Country counts for TV shows whose `listed_in` contains
# "International TV Shows".
international_rows = (netflix_tiles['type'] == 'TV Show') & (
    netflix_tiles['listed_in'].str.contains("International TV Shows")
)
International_shows(netflix_tiles, international_rows, 'country')
# Genre frequencies for movies: split each `listed_in` value on commas,
# remove all spaces from every piece, then count and plot.
genres = list(netflix_movies['listed_in'])
gen = [
    piece.replace(' ', "")
    for entry in genres
    for piece in entry.split(',')
]
# most_common() orders by count, highest first, keeping first-seen order
# among ties.
g = dict(Counter(gen).most_common())
x = list(g.keys())
y = list(g.values())
z = [x, y]
fig = px.scatter(
    data_frame=z,
    x=x,
    y=y,
    size=y,
    title='Movie Genre on Netflix',
    color_discrete_sequence=px.colors.cyclical.HSV,
    template='plotly_dark',
)
fig.show()
def Stand_up(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() replaces list summation, which was quadratic and turned an
    # empty selection into a single bogus value 0.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Country counts for movies whose `listed_in` contains "Stand-Up Comedy".
stand_up_rows = (netflix_tiles['type'] == 'Movie') & (
    netflix_tiles['listed_in'].str.contains("Stand-Up Comedy")
)
Stand_up(netflix_tiles, stand_up_rows, 'country')
def Action_and_adventure_movies(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() replaces list summation, which was quadratic and turned an
    # empty selection into a single bogus value 0.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Country counts for movies whose `listed_in` contains "Action & Adventure".
action_rows = (netflix_tiles['type'] == 'Movie') & (
    netflix_tiles['listed_in'].str.contains("Action & Adventure")
)
Action_and_adventure_movies(netflix_tiles, action_rows, 'country')
def Thriller_movies(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() replaces list summation, which was quadratic and turned an
    # empty selection into a single bogus value 0.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Country counts for movies whose `listed_in` contains "Thriller".
thriller_rows = (netflix_tiles['type'] == 'Movie') & (
    netflix_tiles['listed_in'].str.contains("Thriller")
)
Thriller_movies(netflix_tiles, thriller_rows, 'country')
def Horror_movies(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() replaces list summation, which was quadratic and turned an
    # empty selection into a single bogus value 0.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Country counts for movies whose `listed_in` contains "Horror".
horror_rows = (netflix_tiles['type'] == 'Movie') & (
    netflix_tiles['listed_in'].str.contains("Horror")
)
Horror_movies(netflix_tiles, horror_rows, 'country')
def Romantic_movies(df, condition, column_name):
    """Count the comma-separated values of one column over selected rows.

    Args:
        df: DataFrame to read from.
        condition: boolean mask selecting rows of ``df``.
        column_name: column whose cells hold ``", "``-separated strings.

    Returns:
        Series mapping each distinct value to its number of occurrences,
        most frequent first. Empty when no row is selected.
    """
    # explode() replaces list summation, which was quadratic and turned an
    # empty selection into a single bogus value 0.
    values = df.loc[condition, column_name].dropna().str.split(', ').explode()
    return values.value_counts()
# Country counts for movies whose `listed_in` contains "Romantic".
romantic_rows = (netflix_tiles['type'] == 'Movie') & (
    netflix_tiles['listed_in'].str.contains("Romantic")
)
Romantic_movies(netflix_tiles, romantic_rows, 'country')
# Heatmap of genre vs. country for the first 150 (genre, country) pairs.
# Each title is expanded into one row per genre and per country while both
# columns stay on the same row. The former code flattened the two columns
# separately and wrote them back by position, so a row's genre and country
# came from different titles.
features = ['listed_in', 'country']
a = netflix_tiles[features].dropna()
a = a.assign(listed_in=a['listed_in'].str.split(', ')).explode('listed_in')
a = a.assign(country=a['country'].str.split(', ')).explode('country')
a = a.reset_index(drop=True)

b = a.head(150)
dfs = b.groupby('listed_in')['country'].value_counts().unstack().fillna(0).T
fig = px.imshow(
    dfs,
    color_continuous_scale=px.colors.sequential.Reds,
    aspect='auto',
    title='Distribution of Genres With Respect To Countries',
    template='plotly_dark',
    width=1000,
    height=1000,
)
fig.show()
def _sentiment_label(text):
    """Map a description to 'Positive', 'Negative' or 'Neutral' by the sign
    of its TextBlob polarity."""
    polarity = TextBlob(text).sentiment.polarity
    if polarity == 0:
        return 'Neutral'
    return 'Positive' if polarity > 0 else 'Negative'


# Label every description, then count labels per release year (2010+).
# The labels are written with one column assignment on an explicit copy. The
# row-by-row version wrote to `.loc[[index, 2], ...]`, so row 2 was
# overwritten on every iteration and ended up with the last row's label.
sentiments = netflix[['release_year', 'description']].copy()
# astype(str) keeps TextBlob from failing on the 0 placeholder that the
# working frame uses for a missing description.
sentiments['Sentiment'] = sentiments['description'].astype(str).apply(_sentiment_label)

sentiments = sentiments.groupby(['release_year', 'Sentiment']).size().reset_index(name='Total Content')
sentiments = sentiments[sentiments['release_year'] >= 2010]
fig = px.bar(
    sentiments,
    x="release_year",
    y="Total Content",
    color="Sentiment",
    title="Sentiment of content on Netflix",
    color_discrete_sequence=px.colors.sequential.Reds_r,
    template='plotly_dark',
)
fig.show()
# TF-IDF vectors of the descriptions and their pairwise similarity.
tfidf = TfidfVectorizer(stop_words='english')
# Take the text from the raw frame: in `netflix` a missing description is
# already the integer 0, so fillna('') there changes nothing and the
# vectorizer would receive a non-string.
netflix['description'] = netflix_tiles['description'].fillna('')
tfidf_matrix = tfidf.fit_transform(netflix['description'])
# TF-IDF rows are L2-normalised by default, so the linear kernel (dot
# product) equals cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# title -> row label. Duplicates must be removed on the title index;
# drop_duplicates() compared the (unique) row labels and removed nothing, so
# a repeated title looked up several rows at once.
indices = pd.Series(netflix.index, index=netflix['title'])
indices = indices[~indices.index.duplicated(keep='first')]
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the 10 titles most similar to ``title``.

    Args:
        title: title to look up in the module-level ``indices`` Series.
        cosine_sim: square similarity matrix indexed by row position;
            defaults to the matrix bound to ``cosine_sim`` when this
            function was defined.

    Returns:
        Series of up to 10 titles from ``netflix['title']``, most similar
        first.

    Raises:
        KeyError: if ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields several rows; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the queried row by position rather than assuming it is ranked
    # first: another row with an identical score can sort ahead of it.
    movie_indices = [position for position, _score in ranked if position != idx][:10]
    return netflix['title'].iloc[movie_indices]
# Ask for a title and look up its description-based recommendations.
recommendation = input("Enter a movie/show:")
get_recommendations(recommendation)
# Copy of the raw catalogue in which every missing cell is an empty string.
filledna = netflix_tiles.fillna(value='')
def data(x):
    """Return ``x`` with every space removed and all letters lower-cased."""
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()
# Keep only the text columns used for matching and normalise every cell
# with data() (spaces removed, lower-cased).
features = ['title', 'director', 'cast', 'listed_in', 'description']
filledna = filledna[features]
filledna = filledna.apply(lambda column: column.apply(data))
def recommendation_aspects(x):
    """Join the title, director, cast, listed_in and description fields of
    ``x`` into one string, separated by single spaces."""
    fields = ('title', 'director', 'cast', 'listed_in', 'description')
    return ' '.join(x[field] for field in fields)
# One combined text per title, turned into token counts and a pairwise
# cosine-similarity matrix.
filledna['aspects'] = filledna.apply(recommendation_aspects, axis=1)
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(filledna['aspects'])
# With a single argument the matrix is compared against itself.
cosine_sim2 = cosine_similarity(count_matrix)

# Map each normalised title to its row number after renumbering the rows.
filledna = filledna.reset_index()
indices = pd.Series(filledna.index, index=filledna['title'])
def get_recommendations_new(title, cosine_sim=cosine_sim):
    """Return the 20 titles most similar to ``title``.

    The title is normalised (spaces removed, lower-cased) before it is
    looked up in the module-level ``indices`` Series.

    Args:
        title: title as typed by the user.
        cosine_sim: square similarity matrix indexed by row position.
            NOTE(review): the default is the matrix bound to ``cosine_sim``
            at definition time, not ``cosine_sim2``; the call site passes
            ``cosine_sim2`` explicitly.

    Returns:
        Series of up to 20 titles from ``netflix_tiles['title']``, most
        similar first.

    Raises:
        KeyError: if the normalised title is not present in ``indices``.
    """
    key = title.replace(' ', '').lower()
    idx = indices[key]
    # A title that occurs more than once yields several rows; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the queried row by position rather than assuming it is ranked
    # first: another row with an identical score can sort ahead of it.
    movie_indices = [position for position, _score in ranked if position != idx][:20]
    return netflix_tiles['title'].iloc[movie_indices]
# Ask for a title and look up recommendations with the combined-features
# similarity matrix.
recommendation = input("Enter a movie/show:")
get_recommendations_new(recommendation, cosine_sim2)